1 Introduction

Here we will analyse our data using the fine-scale resolution approach (DADA2).

2 Load necessary packages, functions and data

2.1 Packages

R.version.string
## [1] "R version 3.4.3 (2017-11-30)"
library("tidyverse")
## ── Attaching packages ────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 2.2.1     ✔ purrr   0.2.4
## ✔ tibble  1.3.4     ✔ dplyr   0.7.4
## ✔ tidyr   0.7.2     ✔ stringr 1.2.0
## ✔ readr   1.1.1     ✔ forcats 0.2.0
## ── Conflicts ───────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
packageVersion("tidyverse")
## [1] '1.2.1'
library("phyloseq")
packageVersion("phyloseq")
## [1] '1.22.3'
library("stringr")
packageVersion("stringr")
## [1] '1.2.0'
library("vegan")
## Loading required package: permute
## Loading required package: lattice
## This is vegan 2.4-4
packageVersion("vegan")
## [1] '2.4.4'
library(RColorBrewer)
packageVersion("RColorBrewer")
## [1] '1.1.2'

Set random seed so other can reproduce

set.seed(15062017)

2.2 Functions and plot themes

Custom function and themes for plotting

theme_pub <- function (base_size = 12, base_family = "") {
  theme_grey(base_size = base_size,
             base_family = base_family) %+replace%

    theme(# Set text size
      plot.title = element_text(size = 18),
      axis.title.x = element_text(size = 16),
      axis.title.y = element_text(size = 16,
                                  angle = 90),
      axis.text.x = element_text(size = 14,colour="black",element_text(margin = margin(5,0,0,0),
                                                                       angle = 90)),
      axis.text.y = element_text(size = 14,colour="black",element_text(margin = margin(0,5,0,0))),
      strip.text.y = element_text(size = 15,
                                  angle = -90),

      # Legend text
      legend.title = element_text(size = 15),
      legend.text = element_text(size = 15),
      legend.background = element_rect(fill = "transparent", colour=NA),

      # Configure lines and axes
      axis.ticks.x = element_line(colour = "black"),
      axis.ticks.y = element_line(colour = "black"),

      # Plot background
      panel.background = element_rect(fill = "transparent",colour=NA),
      panel.grid.major = element_blank(),
      panel.grid.minor = element_blank(),
      plot.background = element_rect(fill="transparent", colour=NA),

      # Facet labels       
      legend.key = element_blank(),
      strip.background = element_blank(),
       panel.border = element_blank(),
       strip.text.x = element_text(size=18,face="bold",margin=margin(0,0,5,0))
      )

}

# Create colour palette for taxonomy plotting

colours <- c("#a6cee3","#1f78b4","#b2df8a","#33a02c","#fb9a99",
             "#e31a1c","#fdbf6f","#ff7f00","#cab2d6","#6a3d9a",
             "#ffff99","#b15928","#8BC395","#919D5F","#EB494A",
             "#F79C5D","#DEB969")

# Figure out highest unclassified rank of an OTU/DSV
name_otus = function(ps) {
  
  otuName = as.character(tax_table(ps)[,"Genus"])
  otuName = str_split_fixed(otuName, "_", 2)[,1]
  names(otuName) = taxa_names(ps)
  
  for (level in c("Family", "Order", "Class", "Phylum","Kingdom")) {
    uncl = (otuName=="")
    if (!any(uncl)) break
    otuName[uncl] = as.character(tax_table(ps)[uncl,level])
    otuName[uncl] = str_split_fixed(otuName[uncl], "\\(", 2)[,1]
  }
  
  otuName = sub("_[1-9]$", "", otuName) # remove appendices like "_1"
  otuName = factor(otuName, unique(otuName))
  otuName = otuName[order(otuName)]
  
  return(otuName)
}

# Create table for taxonomy plotting 
taxplottable = function (ps) {
  
  # Add new column with highest unclassified taxonomic rank to ps object
  highest_unclassified_rank <- name_otus(ps) %>%
    data.frame(OTU=names(.), highest_unclassified_rank = .) %>%
    mutate(highest_unclassified_rank = as.character(highest_unclassified_rank))
  taxtable <- as.data.frame(tax_table(ps), stringsAsFactors=F) %>%
    mutate(OTU=row.names(tax_table(ps))) %>%
    left_join(highest_unclassified_rank) %>%
    select(-OTU)
  row.names(taxtable) <- row.names(tax_table(ps))
  tax_table(ps) <- as.matrix(taxtable)
  
  # Figure out top12 genera
  ps_genus <- tax_glom(ps, taxrank="highest_unclassified_rank") %>% # merge by highest_unclassified_rank
  transform_sample_counts(.,  function(OTU) OTU/sum(OTU)) # calculate relabun
  top12_otus <- names(sort(taxa_sums(ps_genus),TRUE)[1:12])
  top12_genera <- as.character(tax_table(ps_genus)[top12_otus,"highest_unclassified_rank"]) # get names
  
  # Calculate relative abundance
  ps_relabun <- transform_sample_counts(ps,  function(OTU) OTU/sum(OTU))

  # Convert phyloseq object into large dataframe and manipulate
  ps_relabun_df <- psmelt(ps_relabun) %>% 
    group_by(OTU, fermentation)  %>%
    filter(max(Abundance) > 0.01) %>% # filter out low abundant OTUs
    ungroup %>%
    filter(highest_unclassified_rank %in% top12_genera) # keep only top 12 genera
     

  return(ps_relabun_df)
}

# Create distance matrix and reformat for easy plotting
calculateDistMat <- function(phylo_obj){
  # Calculate distance matrix
  dm <- vegdist(otu_table(phylo_obj), method="bray")
  
  # Reformat the  to long format matrix
  m <- data.frame(t(combn(rownames(otu_table(phylo_obj)),2)), as.numeric(dm))
  names(m) <- c("sample1", "sample2", "distance")

  # Add factor 'Day'
  m$day_sample1 <- str_split_fixed(m$sample1,"_",4)[,2]
  m$day_sample2 <- str_split_fixed(m$sample2,"_",4)[,2]
  
  # change Day 28 to 30
  m$day_sample1 <- gsub("28","30",m$day_sample1)
  m$day_sample2 <- gsub("28","30",m$day_sample2)

  # Keep only samples from same day
  m <- m[which(m$day_sample1== m$day_sample2),]
  
  return(m)
}  

2.3 Data

The datasets are generated on our cluster with seperate scripts. Here we’ll be importing the results into R/Phyloseq.

dada2 <- readRDS("../datasets/dada2.RDS")

We still have to remove all non-bacterial sequences here (e.g. Chloroplast, mitochondria). Let’s remove these:

# non_bacterial
taxtable <- as.data.frame(tax_table(dada2)) 
non_bacterial <- taxtable$Kingdom!="Bacteria" | is.na(taxtable$Kingdom)
dada2 <- prune_taxa(!non_bacterial,dada2)
# Chloroplast and mitochondria
taxtable <- as.data.frame(tax_table(dada2)) 
chloroplast_mitochondria <- taxtable$Class=="Chloroplast" | taxtable$Family=="Mitochondria"
dada2 <- prune_taxa(!chloroplast_mitochondria,dada2)

dada2 <- prune_samples(sample_sums(dada2)>0,dada2)

Merge technical repeats and restore metadata

grouping_factor_dada2 = apply(sample_data(dada2)[,2:6], 1, paste0,collapse="_")
## Warning in class(X) <- NULL: Setting class(x) to NULL; result will no
## longer be an S4 object
psM = merge_samples(dada2, group=grouping_factor_dada2)
## Warning in asMethod(object): NAs introduced by coercion
## Warning in asMethod(object): NAs introduced by coercion
psM.sample_data = str_split_fixed(rownames(sample_data(psM)), pattern="_", n=ncol(sample_data(psM))-1)
rownames(psM.sample_data) = rownames(sample_data(psM))
colnames(psM.sample_data) = colnames(sample_data(psM))[2:ncol(sample_data(psM))]
psM.sample_data = data.frame(psM.sample_data)
sample_data(psM) = sample_data(psM.sample_data)

dada2 <- psM

Remove failed samples:

dada2 <- prune_samples(sample_data(dada2)$type%in%c("CJ1","CJ2","FP"),dada2)
# Remove failed D55 samples and failed CJ2 D00
dada2  <- prune_samples(sample_data(dada2)$Day!=55,dada2)
samplesD00_CJ2 <- sample_data(dada2)$type=="CJ2"&sample_data(dada2)$Day=="00"
dada2  <- prune_samples(!samplesD00_CJ2,dada2)
# Remove empty taxa
dada2 <- prune_taxa(taxa_sums(dada2) > 0, dada2)

Some details:

dada2
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 510 taxa and 182 samples ]
## sample_data() Sample Data:       [ 182 samples by 5 sample variables ]
## tax_table()   Taxonomy Table:    [ 510 taxa by 6 taxonomic ranks ]
# Plot total reads per sample and read distribution
readsumsdf = data.frame(nreads = sort(taxa_sums(dada2), TRUE), sorted = 1:ntaxa(dada2),
                        type = "DSVs")
readsumsdf = rbind(readsumsdf, data.frame(nreads = sort(sample_sums(dada2),TRUE), 
                                          sorted = 1:nsamples(dada2), type = "Samples"))
title = "Total number of reads"
p = ggplot(readsumsdf, aes(x = sorted, y = nreads)) + geom_bar(stat = "identity")
p + ggtitle(title) + scale_y_log10() + facet_wrap(~type, 1, scales = "free")

3 Analysis

3.1 Alpha diversity

richness_dada2<-  estimate_richness(dada2,measures=c('Observed','InvSimpson')) %>%
  mutate(Samplename = row.names(.)) %>%
  gather(.,key="DivType",value="Alphadiv", Observed, InvSimpson) %>%
  mutate(type = str_split_fixed(Samplename,"_",8)[,1]) %>%
  mutate(Day = as.integer(str_split_fixed(Samplename,"_",8)[,2])) %>%
  mutate(fermentation = str_split_fixed(Samplename,"_",8)[,5]) %>%
  mutate(DivType = factor(DivType,levels=c("Observed","InvSimpson"))) %>%
  mutate(type = str_replace_all(type, "CJ1", "LF1"),
         type = str_replace_all(type, "CJ2", "LF2"),
         type = str_replace_all(type, "FP", "HF")) %>%
  mutate(type = as_factor(type)) %>%
  mutate(type = fct_relevel(type, "HF", after = Inf))
## Warning in estimate_richness(dada2, measures = c("Observed", "InvSimpson")): The data you have provided does not have
## any singletons. This is highly suspicious. Results of richness
## estimates (for example) are probably unreliable, or wrong, if you have already
## trimmed low-abundance taxa from the data.
## 
## We recommended that you find the un-trimmed data and retry.

Plot:

Colours <- c("#1b9e77","#d95f02","#7570b3")
title <- "Alphadiversity \n"
p <- ggplot(richness_dada2,aes(x=Day, y=Alphadiv))
p + facet_wrap(~DivType,nrow=2, scales = "free_y") +
  stat_summary(aes(group=fermentation, colour=type, alpha=type), fun.y=mean, geom="line") +
  geom_point(data=richness_dada2[richness_dada2$type=="FP",],aes(colour=type), size=2,alpha=0.8) +
  geom_point(data=richness_dada2[richness_dada2$type!="FP",], aes(colour=type), size=2,alpha=1) +
  scale_color_manual(values=Colours) +
  scale_alpha_manual(values = c(1,1,0.4)) + 
  scale_x_continuous(limits = c(0,56),breaks=seq(0,60,10),expand=c(0.01,0)) +
  theme_pub() + 
  theme(legend.position= 'bottom', 
        legend.title = element_blank(),
        axis.text.x=element_text(margin=margin(5,0,0,0)),
        panel.grid.major.y = element_line(colour = "grey")) +
  ggtitle(title) +
  ylab("Alpha diversity measure \n") +
  xlab("\n Time (Days)") +
  expand_limits(y=0)

ggsave(filename = "/mnt/H/Drafts/Sander/Fermentatie/Supplementary/SupFigure1.tiff",
         width = 178,
         height = 210,
         units = "mm",
         dpi = 300)

3.2 Taxonomic composition

# select lab fermentations only
dada2_labferm <- prune_samples(sample_data(dada2)$type=="CJ1"|sample_data(dada2)$type=="CJ2",dada2)

# Remove empty taxa
dada2_labferm <- prune_taxa(taxa_sums(dada2_labferm) > 0, dada2_labferm)

# Calculate relative abundance and prepare data for plotting
dada2_labferm_plot <- taxplottable(dada2_labferm)
## Joining, by = "OTU"
## Warning: Column `OTU` joining character vector and factor, coercing into
## character vector

Plot lab fermentations first to see how similar biological repeats are.

# Plot using ggplot2
ggplot(dada2_labferm_plot, aes(x=Biol_repeat, y=Abundance, fill=highest_unclassified_rank)) + geom_bar(stat = "identity", color = "black") + 
  ylab("Percentage of Sequences \n") + 
  xlab("") +
  theme_pub() +
  theme(axis.text.x= element_blank(), axis.ticks.x=element_blank(), legend.position="bottom") +
  labs(fill="") + 
  facet_grid(fermentation~Day,scales="free",drop=T) +
  scale_fill_manual(values = colours) +
  scale_y_continuous(expand=c(0,0)) + 
  guides(fill=guide_legend(byrow=F,ncol=2)) 

The biological repeats look very similar. Let’s calculate their mean abundance per day per OTU per fermentation and start parsing the data for plotting.

# Calculate relative abundance and prepare data for plotting
dada2_plot <- taxplottable(dada2)
## Joining, by = "OTU"
## Warning: Column `OTU` joining character vector and factor, coercing into
## character vector
# Caluclate mean abundance per OTU per Day per fermentation
dada2_plot_mean <- dada2_plot %>%
  group_by(OTU, Day, fermentation) %>%
  summarise(meanAbundance= mean(Abundance))

# Refine the data for plotting
dada2_plot_mean_metadata <- dada2_plot %>%
  select(-Biol_repeat,-Sample, -Abundance) %>% 
  left_join(dada2_plot_mean) %>% # Add other metadata again
  distinct() %>%
  mutate(Day = as.numeric(as.character(Day)), meanAbundance = meanAbundance * 100)  # Change Day to numeric and multiply abundance by 100
## Joining, by = c("OTU", "Day", "fermentation")
# Relevel highest_unclassified_rank to match colours of main genera with other plots
common_taxa <- c("Leuconostoc", "Lactobacillus", "Lactococcus", "Weissella", "Enterobacteriaceae", "Pectobacterium", "Pseudomonas") 
additional_taxa <- dada2_plot_mean_metadata$highest_unclassified_rank %>%
  unique() %>%
  .[!. %in% common_taxa] %>%
  as.character()

Levels <- c(common_taxa,additional_taxa)

dada2_plot_mean_metadata$highest_unclassified_rank <- factor(dada2_plot_mean_metadata$highest_unclassified_rank, levels=Levels)

# Relevel so that same highest_unclassified_rank will be plotted together
order <- dada2_plot_mean_metadata[order(dada2_plot_mean_metadata$highest_unclassified_rank),] %>%
  pull(OTU) %>%
  unique()

dada2_plot_mean_metadata <- dada2_plot_mean_metadata %>%
  mutate(OTU_reorder = factor(OTU, levels = order))

Three fermentations did not show a complete sample collection. Let’s remove their other time points from the plot as well.

dada2_plot_mean_metadata <- dada2_plot_mean_metadata %>%
  filter(fermentation != "FP05", fermentation != "FP12", fermentation != "FP37")

As a final manipulation, we will be clustering our data according to their similarity on day 30. This will reorder the plot so that fermentations that have a similar outcome on D30 will be next to each other in the plot.

clust <- otu_table(dada2) %>%
  vegdist(.,meterhod = "bray") %>% # Calculate Bray-Curtis distance matrix
  hclust(method = "average") # Perform hierarchical clustering
  
clustOrder <- clust$labels[clust$order] 
# Select Day30 only
clustOrder <- clustOrder[which(str_split_fixed(clustOrder,"_",5)[,2] %in% c(30,28))]

# Abbreaviate sample name
clustOrder <- unique(str_split_fixed(clustOrder,"_",5)[,5])

# Relevel fermentation factor in dataset to follow the clustering order
dada2_plot_mean_metadata$fermentation <- factor(dada2_plot_mean_metadata$fermentation, level = clustOrder)

As a last thing we’ll change the name of the facet_grid headers to be more in line with the publication

altNamelabelsdf <- dada2_plot_mean_metadata %>%
  select(fermentation) %>%
  distinct %>%
  mutate(altName = gsub("FP","HF",fermentation)) %>%
  mutate(altName = gsub("CJ","LF", altName)) 

altNamelabels <- altNamelabelsdf$altName
names(altNamelabels) <- altNamelabelsdf$fermentation

Now let’s plot again!

dada2_plot_mean_metadata %>%
  ggplot(aes(x=Day, y=meanAbundance, group=OTU)) +
  xlab("Time (days)") +
  ylab("Relative abundance (%)") +
  geom_area(aes(fill=highest_unclassified_rank, group=OTU),colour="white", size=0.05) + 
  geom_vline(aes(xintercept=Day),colour='black', linetype='dotted', alpha= 0.7) +
  theme_pub() + 
  scale_fill_manual(values=colours) +
  expand_limits(y=0,x=0) + 
  scale_y_continuous(limits=c(0,100),breaks=c(0,50,100)) +
  scale_x_continuous(breaks=c(0,10,20,30,40,50)) +
  theme(strip.text.x = element_text(size=8,face="bold",margin=margin(0,0,1,0)),
        legend.position = "bottom",
        legend.text = element_text(size = 8, face = "italic"),
        legend.title=element_blank(), 
        panel.border = element_blank(),
        axis.text.x= element_text(size=7, margin=margin(2,0,0,0)),
        axis.text.y= element_text(size=7, margin=margin(0,0,0,2)),
        axis.title.x= element_text(size=12, margin = margin(2,0,0,0, "mm")),
        axis.title.y= element_text(size=12, margin = margin(0,2,0,0, "mm")),
        legend.margin = margin(0,0,0,0, "mm")) + 
  guides(fill=guide_legend(byrow = F,ncol=4)) +
  facet_wrap(~fermentation, ncol=5, scales = "free_x", labeller = labeller(fermentation = altNamelabels))

ggsave(filename = "/mnt/H/Drafts/Sander/Fermentatie/Paperpictures/Figure2.tiff",
         width = 178,
         height = 210,
         units = "mm",
         dpi = 300)

Let’s plot this again but coloured on genus level

ggplot(dada2_plot_mean_metadata,aes(x=Day, y=meanAbundance, group=OTU)) + 
  xlab("\n Day") +
  ylab("Relative abundance\n") +
  geom_area(aes(fill=Order, group=OTU),colour="white", size=0.2) + 
  geom_vline(aes(xintercept=Day),colour='black', linetype='dotted', alpha= 0.8) +
  theme_pub() + 
  scale_fill_manual(values=c('#1b9e77','#d95f02','#7570b3','#e7298a','#66a61e','#e6ab02')) +
  theme(legend.title=element_blank(), legend.text = element_text(face ="italic"),
        panel.border = element_blank(),
        axis.text.x= element_text(size=8, margin=margin(2,0,0,0)),
        axis.text.y= element_text(size=8, margin=margin(0,0,0,2)),
        legend.position="right") +
  expand_limits(y=0,x=0) + 
  scale_y_continuous(limits=c(0,100),breaks=c(0,50,100)) +
  scale_x_continuous(breaks=c(0,10,20,30,40,50)) +
  theme(plot.margin=unit(c(0,1,0,0),"cm")) + 
  guides(fill=guide_legend(byrow = F,ncol=1)) +
  facet_wrap(~fermentation, ncol=5, scales = "free_x", labeller = labeller(fermentation = altNamelabels))

3.2.1 Digging deeper in the ASVs

First let’s give every ASV an unique name.

# Code originated from Stijn Wittouck
# Rename taxa names
uniquename <- dada2_plot_mean_metadata %>% 
  select(OTU, highest_unclassified_rank) %>%
  distinct() %>% 
  group_by(highest_unclassified_rank) %>%
  mutate(n_taxa = n()) %>%
  mutate(taxon_number = ifelse(n_taxa > 1, as.character(1:n()), "")) %>%
  mutate(taxon_name = paste(highest_unclassified_rank, taxon_number, sep = " ")) %>%
  ungroup() %>%
  select(- highest_unclassified_rank, - n_taxa, - taxon_number)

# Merge df's and filter for only most abundant
dada2_plot_mean_metadata <- dada2_plot_mean_metadata %>%
  left_join(uniquename) 
## Joining, by = "OTU"
# Save file for phylogenetic placement for Stijn
dada2_plot_mean_metadata %>%
  select(OTU, Family, Genus, taxon_name) %>%
  distinct %>% 
  filter(Family %in% c('Lactobacillaceae', 'Leuconostocaceae')) %>%
  write_tsv("ASVs_CJ.tsv")

3.2.1.1 Lactobacillus

# Lactobacillsus
dada2_plot_mean_metadata_Lactobacillus <- dada2_plot_mean_metadata %>%
  filter(Genus == "Lactobacillus")

colourCount = length(unique(dada2_plot_mean_metadata_Lactobacillus$OTU))
getPalette = colorRampPalette(brewer.pal(9, "Set1"))

ggplot(dada2_plot_mean_metadata_Lactobacillus, aes(x=Day, y=meanAbundance, group=OTU)) + 
  xlab("Time (days)") +
  ylab("Relative abundance (%)") +
  geom_area(aes(fill=taxon_name, group=OTU),colour="white", size=0.05) + 
  geom_vline(aes(xintercept=Day),colour='black', linetype='dotted', alpha= 0.7) +
  theme_pub() + 
  scale_fill_manual(values = getPalette(colourCount)) +
  expand_limits(y=0,x=0) + 
  scale_x_continuous(breaks=c(0,10,20,30,40,50)) +
  theme(strip.text.x = element_text(size=8,face="bold",margin=margin(0,0,1,0)),
        legend.position = "bottom",
        legend.text = element_text(size = 8, face = "italic"),
        legend.title=element_blank(), 
        panel.border = element_blank(),
        axis.text.x= element_text(size=7, margin=margin(2,0,0,0)),
        axis.text.y= element_text(size=7, margin=margin(0,0,0,2)),
        axis.title.x= element_text(size=12, margin = margin(2,0,0,0, "mm")),
        axis.title.y= element_text(size=12, margin = margin(0,2,0,0, "mm")),
        legend.margin = margin(0,0,0,0, "mm")) + 
  guides(fill=guide_legend(byrow = F,ncol=4)) +
  facet_wrap(~fermentation, ncol=5, scales = "free_x", labeller = labeller(fermentation = altNamelabels))

ggsave(filename = "/mnt/H/Drafts/Sander/Fermentatie/Supplementary/SupFigure3.tiff",
         width = 178,
         height = 210,
         units = "mm",
         dpi = 300)

3.2.1.2 Leuconostoc

# Leuconostoc
dada2_plot_mean_metadata_Leuconostoc <- dada2_plot_mean_metadata %>%
  filter(Genus == "Leuconostoc")

colourCount = length(unique(dada2_plot_mean_metadata_Leuconostoc$OTU))
getPalette = colorRampPalette(brewer.pal(9, "Set1"))

ggplot(dada2_plot_mean_metadata_Leuconostoc, aes(x=Day, y=meanAbundance, group=OTU)) + 
  xlab("Time (days)") +
  ylab("Relative abundance (%)") +
  geom_area(aes(fill=taxon_name, group=OTU),colour="white", size=0.05) + 
  geom_vline(aes(xintercept=Day),colour='black', linetype='dotted', alpha= 0.7) +
  theme_pub() + 
  scale_fill_manual(values = getPalette(colourCount)) +
  expand_limits(y=0,x=0) + 
  scale_x_continuous(breaks=c(0,10,20,30,40,50)) +
  theme(strip.text.x = element_text(size=8,face="bold",margin=margin(0,0,1,0)),
        legend.position = "bottom",
        legend.text = element_text(size = 8, face = "italic"),
        legend.title=element_blank(), 
        panel.border = element_blank(),
        axis.text.x= element_text(size=7, margin=margin(2,0,0,0)),
        axis.text.y= element_text(size=7, margin=margin(0,0,0,2)),
        axis.title.x= element_text(size=12, margin = margin(2,0,0,0, "mm")),
        axis.title.y= element_text(size=12, margin = margin(0,2,0,0, "mm")),
        legend.margin = margin(0,0,0,0, "mm")) + 
  guides(fill=guide_legend(byrow = F,ncol=4)) +
  facet_wrap(~fermentation, ncol=5, scales = "free_x", labeller = labeller(fermentation = altNamelabels))

ggsave(filename = "/mnt/H/Drafts/Sander/Fermentatie/Supplementary/SupFigure2.tiff",
         width = 178,
         height = 210,
         units = "mm",
         dpi = 300)

3.2.2 Correlation between ASVs

Let’s see which ASVs are correlated. Meaning which ASVs can be found in the same fermentation. The might come from the same organism containing multiple 16S copies.

dada2_otu <- as.data.frame(otu_table(dada2))
dada2_otu$samples <- row.names(dada2_otu)

# Data clean up
dada2_otu <- dada2_otu %>% 
  filter(!str_detect(samples,"FP05"),  # filter incomplete fermentations 
         !str_detect(samples,"FP12"),
         !str_detect(samples,"FP37"))  %>%
  mutate(fermentation = str_extract(samples, "[^_]+$")) %>% # Extract metadata
  mutate(Day = as.integer(ifelse(fermentation %in% c("CJ1", "CJ2"), str_sub(samples, 5,6), str_sub(samples,4,5)))) %>% 
  filter(Day %in% c(1, 3, 30, 28)) %>% # filter to keep day 1, 3 and 30
  gather(key = "ASV" , value = "abundance", -samples, -fermentation, -Day) %>%
  group_by(fermentation, ASV) %>%
  summarise(abundance = sum(abundance)) %>% # sum all ASVs belonging to same fermentation
  rename(OTU = ASV) %>% # Add unqiue taxon name
  left_join(dada2_plot_mean_metadata[,c("OTU", "taxon_name")]) %>% # This step also filters on top12 most abundant genera
  distinct() %>%
  filter(!is.na(taxon_name)) %>%
  select(-OTU) %>%
  spread(fermentation, abundance) %>% # remove empty rows
  mutate(sumVar = rowSums(.[2:length(colnames(.))])) %>%
  filter(sumVar > 0) %>%
  select(-sumVar)
## Joining, by = "OTU"
# Convert to presense absence
dada2_otu <- as.data.frame(dada2_otu)
row.names(dada2_otu) <- dada2_otu$taxon_name
dada2_otu <- dada2_otu[,-1]
dada2_otu[dada2_otu > 0] <- 1
dada2_otu <- t(dada2_otu)

# Filter out de ASVs present in every fermentation
dada2_otu <- dada2_otu[,!colSums(dada2_otu) == length(rownames(dada2_otu))]

# Calculate correlation matrix, make tidy, plot
corr <- as.data.frame(cor(dada2_otu))
corr$ASV1 <- row.names(corr)
corr %>% 
  gather(key = "ASV2", value = "corr", -ASV1) %>%
  filter(!is.na(corr)) %>%
  ggplot(aes(x= ASV1, y = ASV2)) + 
  geom_raster(aes(fill = corr)) +
  scale_fill_gradient(low="grey90", high="red") + 
  scale_y_discrete(position = "right") +
  theme(axis.text.x = element_text(size = 8 , angle = 90, hjust = 1, vjust= 0.5, face = "italic"),
        axis.text.y = element_text(size = 8 , hjust = 1, vjust= 0.5, face = "italic"),
        legend.position = "bottom",
        legend.title = element_blank(),
        axis.title = element_blank())

Lactobacillus 8, 9 and Lactobacillus 11 seems to bee highly correlated. Read: they are found together in all fermentations possibly indication 16S sequencing variants of one single strain. However, they are only found in one single fermentation. Hence, the high correlation. This is not the case for Enterobacteriaceae 2 and Pectobacterium. They are found in multiple fermentations!

3.3 Some numbers and percentages for paper

We want to know the percentage of samples that have certain genera.

3.3.1 Function for calculating the stats

calculate_statistics = function(dataframe, genus) {
  
  dataframe %>%
  filter(highest_unclassified_rank == genus) %>%
  group_by(fermentation) %>%
  summarise(maxAb = max(meanAbundance)) %>%
  filter(maxAb > 0) %>%
  summarise(highest_unclassified_rank = genus, Percentage = n()/40*100, MaxAbundance = max(maxAb))
  
}

Generate stats

stats=NULL
for (genus in unique(dada2_plot_mean_metadata$highest_unclassified_rank)){
  object <- calculate_statistics(dada2_plot_mean_metadata,genus)
  stats <- rbind(stats,object)
}

stats
## # A tibble: 12 x 3
##    highest_unclassified_rank Percentage MaxAbundance
##                        <chr>      <dbl>        <dbl>
##  1             Lactobacillus       97.5    70.554675
##  2                Klebsiella      100.0    68.000000
##  3                  Yersinia      100.0    53.540732
##  4                 Ewingella       97.5    51.617964
##  5                   Pantoea       90.0    48.774300
##  6               Leuconostoc      100.0    51.528661
##  7               Lactococcus       67.5    49.634496
##  8                 Weissella       62.5    39.947090
##  9            Pectobacterium       27.5    36.308775
## 10               Citrobacter       97.5    35.852584
## 11               Pseudomonas       17.5    11.518325
## 12        Enterobacteriaceae       82.5     8.923885

3.3.2 Count per ASV

lactobacillus_dada2 <- dada2_plot_mean_metadata %>%
  filter(Genus == "Lactobacillus") %>%
  select(OTU, taxon_name, fermentation, meanAbundance) %>%
  group_by(taxon_name, fermentation) %>%
  mutate(maxAbundance = max(meanAbundance)) %>%
  select(-meanAbundance) %>%
  distinct() %>%
  group_by(taxon_name, OTU) %>%
  summarise(count = n(), max_Abundance = max(maxAbundance), min_max_Abundance = min(maxAbundance))

write_tsv(lactobacillus_dada2, "lactobacillus_dada2.tsv")

lactobacillus_dada2 %>%
  select(-OTU)
## # A tibble: 20 x 4
## # Groups:   taxon_name [20]
##          taxon_name count max_Abundance min_max_Abundance
##               <chr> <int>         <dbl>             <dbl>
##  1  Lactobacillus 1    23    70.5546751         1.0218830
##  2 Lactobacillus 10     2    13.1437235         3.1483016
##  3 Lactobacillus 11     1     9.9379635         9.9379635
##  4 Lactobacillus 12     1     9.6637584         9.6637584
##  5 Lactobacillus 13     1     8.6910558         8.6910558
##  6 Lactobacillus 14     1     7.9048853         7.9048853
##  7 Lactobacillus 15     1     2.6083120         2.6083120
##  8 Lactobacillus 16     3     6.1393681         1.8676036
##  9 Lactobacillus 17     1     1.3020701         1.3020701
## 10 Lactobacillus 18     1     0.8866784         0.8866784
## 11 Lactobacillus 19     1     1.6904129         1.6904129
## 12  Lactobacillus 2    23    63.0129494         1.4085556
## 13 Lactobacillus 20     1     0.3339171         0.3339171
## 14  Lactobacillus 3     9    50.5065829         1.0352536
## 15  Lactobacillus 4    20    35.7630156         1.1298507
## 16  Lactobacillus 5     7    32.9156188         1.5781950
## 17  Lactobacillus 6     6     9.4302922         2.8630859
## 18  Lactobacillus 7     1    23.6236543        23.6236543
## 19  Lactobacillus 8     1    21.1833411        21.1833411
## 20  Lactobacillus 9     1    13.7894801        13.7894801
leuconostoc_dada2 <- dada2_plot_mean_metadata %>%
  filter(Genus == "Leuconostoc") %>%
  select(OTU, taxon_name, fermentation, meanAbundance) %>%
  group_by(taxon_name, fermentation) %>%
  mutate(maxAbundance = max(meanAbundance)) %>%
  select(-meanAbundance) %>%
  distinct() %>%
  group_by(taxon_name, OTU) %>%
  summarise(count = n(), max_bundance = max(maxAbundance), min_max_Abundance = min(maxAbundance))

write_tsv(leuconostoc_dada2, "leuconostoc_dada2.tsv")


leuconostoc_dada2 %>%
  select(-OTU)
## # A tibble: 8 x 4
## # Groups:   taxon_name [8]
##      taxon_name count max_bundance min_max_Abundance
##           <chr> <int>        <dbl>             <dbl>
## 1 Leuconostoc 1    20    51.528661          1.240902
## 2 Leuconostoc 2    39    36.964249          2.824798
## 3 Leuconostoc 3    35    39.622642          1.003817
## 4 Leuconostoc 4     9    14.856249          1.217901
## 5 Leuconostoc 5    14    10.050622          1.532413
## 6 Leuconostoc 6     7     6.050905          1.060183
## 7 Leuconostoc 7     6     4.781671          1.020774
## 8 Leuconostoc 8     1     1.244204          1.244204

Plot together

enterobacteriaceae_dada2 <- dada2_plot_mean_metadata %>%
  filter(Family == "Enterobacteriaceae") %>%
  select(OTU, taxon_name, fermentation, meanAbundance) %>%
  group_by(taxon_name, fermentation) %>%
  mutate(maxAbundance = max(meanAbundance)) %>%
  select(-meanAbundance) %>%
  distinct() %>%
  group_by(taxon_name, OTU) %>%
  summarise(count = n(), maxAbundance = max(maxAbundance), species = "Enterobacteriaceae")

write_tsv(enterobacteriaceae_dada2, "enterobacteriaceae_dada2.tsv")


enterobacteriaceae_dada2 %>%
  select(-OTU)
## # A tibble: 10 x 4
## # Groups:   taxon_name [10]
##              taxon_name count maxAbundance            species
##                   <chr> <int>        <dbl>              <chr>
##  1         Citrobacter     39    35.852584 Enterobacteriaceae
##  2 Enterobacteriaceae 1    19     8.923885 Enterobacteriaceae
##  3 Enterobacteriaceae 2    30     7.403258 Enterobacteriaceae
##  4           Ewingella     39    51.617964 Enterobacteriaceae
##  5         Klebsiella 1    40    68.000000 Enterobacteriaceae
##  6         Klebsiella 2    12    47.746716 Enterobacteriaceae
##  7            Pantoea 1    36    48.774300 Enterobacteriaceae
##  8            Pantoea 2     2    10.070995 Enterobacteriaceae
##  9      Pectobacterium     11    36.308775 Enterobacteriaceae
## 10            Yersinia     40    53.540732 Enterobacteriaceae
rbind(lactobacillus_dada2, leuconostoc_dada2) %>%
  mutate(species = str_extract(taxon_name, "[A-z]*")) %>%
  rbind(enterobacteriaceae_dada2) %>%  
  mutate(species = factor(species, levels = c("Leuconostoc",
                                                 "Lactobacillus",
                                                 "Enterobacteriaceae"))) %>%
  ggplot(aes(y = reorder(taxon_name, count), x = count)) +
  geom_point() +
  facet_wrap(~species, nrow = 3, scales = 'free_y') +
  theme_bw() + 
  theme(strip.text = element_text(face = "italic"),
        axis.text.y = element_text(face = "italic")) +
  ylab("") + 
  xlab("Presence in different fermentations \n") 

3.4 Beta diversity between samples (Did not end up in the final manuscript)

One main question we have is whether the spontaneous fermentation are different/equal. We will split the fermentations up in their two main actors, mainly, Enterobacteriaceae and Lactobacillaceae/LAB. For this we will use only Day 1, Day 3 and Day30 as they are available in all fermentations.

3.4.1 Preprocessing the datasets

We need to to the following things to the dataset:

  • Merge Biological repeats for CJ1 and CJ2
  • Select Days 01, 03 and 28 for CJ1 and CJ2; select Days 01, 03 and 30 for FP
  • Split dataset in two: Enteros and Lactos
  • Remove samples and OTUs that did not survive these criteria
# Merge Biologic repeats CJ1 and CJ2
grouping_factor <- apply(sample_data(dada2)[,-4], 1, paste0,collapse="_")
## Warning in class(X) <- NULL: Setting class(x) to NULL; result will no
## longer be an S4 object
psM <- merge_samples(dada2, group=grouping_factor)
psM.sample_data <- str_split_fixed(rownames(sample_data(psM)), pattern="_", n=4)
rownames(psM.sample_data) <- rownames(sample_data(psM))
colnames(psM.sample_data) <- colnames(sample_data(psM))[c(1:3,5)]
psM.sample_data <- data.frame(psM.sample_data)
sample_data(psM) <- sample_data(psM.sample_data)

dada2_merged <- psM

# Calculate Relative abundance
dada2_merged_relabun <- transform_sample_counts(dada2_merged,  function(OTU) OTU/sum(OTU))

daysToSelect<- c("01","03","28","30")

# Split dataset in Enteros and select relevant days
enteros.dada2_merged_relabun <- dada2_merged_relabun %>%
  subset_taxa(., Order=="Enterobacteriales") %>%
  subset_samples(., Day%in%daysToSelect) %>%
  prune_taxa(taxa_sums(.) > 0, .) %>%
  prune_samples(sample_sums(.) > 0, .)

# Do same for lactos
lactos.dada2_merged_relabun <- dada2_merged_relabun %>%
  subset_taxa(., Order=="Lactobacillales") %>%
  subset_samples(., Day%in%daysToSelect) %>%
  prune_taxa(taxa_sums(.) > 0, .) %>%
  prune_samples(sample_sums(.) > 0, .)

cat("Amount of Enteros: ", ntaxa(enteros.dada2_merged_relabun))
## Amount of Enteros:  23
cat("Amount of Lactos: ", ntaxa(lactos.dada2_merged_relabun))
## Amount of Lactos:  48

3.4.2 PCoA

title <- "PCoA (Bray-Curtis distance) ENTEROS dada2 \n"
ordination <- ordinate(enteros.dada2_merged_relabun,"PCoA","bray")
p <- plot_ordination(enteros.dada2_merged_relabun, ordination, color="Day") +
    geom_line(aes(group = fermentation), color="grey") 
# Remove layer of points to plot over jitter points
p$layers <- p$layers[-1]
p <- p + geom_point(size=3,alpha=0.8) + ggtitle(title)
p

title <- "PCoA (Bray-Curtis distance) LACTOS dada2 \n"
ordination <- ordinate(lactos.dada2_merged_relabun,"PCoA","bray")
p <- plot_ordination(lactos.dada2_merged_relabun, ordination, color="Day") +
    geom_line(aes(group = fermentation), color="grey") 
# Remove layer of points to plot over jitter points
p$layers <- p$layers[-1]
p <- p + geom_point(size=3,alpha=0.8) + ggtitle(title)
p

3.4.3 Plot

Let’s calculate the distance matrices and merge them.

dm_lactos_dada2 <- calculateDistMat(lactos.dada2_merged_relabun) %>%
  mutate(Order = "Lactobacillales", tool = "dada2")
dm_enteros_dada2 <- calculateDistMat(enteros.dada2_merged_relabun) %>%
  mutate(Order = "Enterobacteriales", tool = "dada2")


dm_merged <- rbind(dm_lactos_dada2,dm_enteros_dada2)

Start plotting

plot <- ggplot(dm_merged)
plot + geom_density(aes(x=distance, group=Order, colour=Order)) +
  geom_jitter(aes(x=distance, y=ifelse(Order=="Enterobacteriales",-1,-2), colour=Order),alpha= 0.7, size=0.5) +
  xlim(0,1.001) +
  scale_y_continuous(breaks=c(0,1,2)) +
  facet_grid(day_sample1 ~ tool) +
  theme_pub() +
  theme(legend.position = "bottom") 

Alternative plots

dm_merged_dada2 <- dm_merged %>% 
  filter(tool == "dada2")
  
dm_merged_transformed <- dm_merged_dada2 %>%
  group_by(day_sample1, Order) %>%
  do(data.frame(loc = density(.$distance)$x, dens = density(.$distance)$y))

# Flip and offset densities for the groups
dm_merged_transformed$dens <- ifelse(dm_merged_transformed$Order == "Enterobacteriales",
                                      dm_merged_transformed$dens * -1, 
                                      dm_merged_transformed$dens)

dm_merged_transformed$dens <- ifelse(dm_merged_transformed$day_sample1 == "01",
                                      dm_merged_transformed$dens, 
                                      ifelse(dm_merged_transformed$day_sample1 == "03",
                                             dm_merged_transformed$dens + 5,
                                             dm_merged_transformed$dens + 10))

# Plot
plot <- ggplot(dm_merged_transformed, aes(dens, loc, fill = Order, group = interaction(Order, day_sample1)))
plot  + geom_polygon() + 
  ylab("Bray-Curtis Distance") +
  xlab("Day") +
  scale_x_continuous(breaks = c(0,5,10), labels = c("01","03","30")) + 
  scale_y_continuous(breaks = c(0, 0.25, 0.50, 0.75, 1)) +
  theme_pub() + 
  theme(legend.position = "bottom")

Calculate per timepoint

dm_enteros_dada2 %>% 
  group_by(day_sample1) %>%
  summarise(mean=mean(distance), sd = sd(distance))
## # A tibble: 3 x 3
##   day_sample1      mean        sd
##         <chr>     <dbl>     <dbl>
## 1          01 0.4325657 0.1488455
## 2          03 0.4007639 0.1416422
## 3          30 0.5389011 0.1750159
dm_lactos_dada2 %>% 
  group_by(day_sample1) %>%
  summarise(mean=mean(distance), sd = sd(distance))
## # A tibble: 3 x 3
##   day_sample1      mean        sd
##         <chr>     <dbl>     <dbl>
## 1          01 0.6505199 0.2150153
## 2          03 0.5024399 0.2158668
## 3          30 0.7226945 0.1824628